In [118]:
import lifelines
import pymc as pm
from pyBMA.CoxPHFitter import CoxPHFitter
import matplotlib.pyplot as plt
import numpy as np
from math import log
import pandas as pd
%matplotlib inline

The first step in any data analysis is acquiring and munging the data. An example data set can be found HERE

Download the file output.txt and transform it into a format like below where the event column should be 0 if there's only one entry for an id, and 1 if there are two entries:

id,time_to_convert,age,male,event


In [148]:
#Solution to part one:
def convert_to_minutes(dt):
    """Turn a timedelta into fractional days, mapping zero deltas to 23.0.

    A zero delta means the id had only one row (no conversion observed); the
    sentinel value 23.0 marks those rows for censoring downstream.

    NOTE(review): despite the name, the returned value is in *days* (dt is
    divided by one day), not minutes. Name kept for compatibility.
    """
    days = dt / np.timedelta64(1, 'D')
    return 23.0 if days == 0 else days
# NOTE(review): machine-specific path — point this at wherever output.txt
# was downloaded.
base_df = pd.read_csv("E:/output.txt")

# Parse the raw timestamp column, then drop the original string column.
# (`drop(columns=...)` replaces the positional-axis form `drop('datetime', 1)`,
# which was removed in pandas 2.0.)
base_df["time_to_convert"] = pd.to_datetime(base_df['datetime'])
base_df = base_df.drop(columns='datetime')

# Per id: max timestamp minus min timestamp. Ids with a single row get a zero
# delta, which convert_to_minutes maps to the 23.0 censoring sentinel.
time_deltas = base_df.groupby(by = "id").max() - base_df.groupby(by = "id").min()
df = time_deltas["time_to_convert"].apply(convert_to_minutes).to_frame()

# Carry the covariates over (max is a no-op for per-id-constant columns).
grouped_base = base_df.groupby(by = "id").max()
df["age"] = grouped_base["age"]
df["male"] = grouped_base["male"]
df["search"] = grouped_base["search"]
df["brand"] = grouped_base["brand"]

# event = 1 when a conversion was observed, 0 for censored (sentinel) rows.
df["event"] = (df["time_to_convert"] != 23.0).astype(int)

df["time_to_convert"].median()


Out[148]:
12.829432174733796

In [ ]:
###Parametric Bayes
#Shout out to Cam Davidson-Pilon

In [136]:
## Example fully worked model using toy data
## Adapted from http://blog.yhat.com/posts/estimating-user-lifetimes-with-pymc.html

# Flat priors on the Weibull shape (alpha) and scale (beta).
alpha = pm.Uniform("alpha", 0, 20)
beta = pm.Uniform("beta", 0, 20)

# Observed likelihood: conversion times modelled as Weibull(alpha, beta).
obs = pm.Weibull('obs', alpha, beta, value = df["time_to_convert"], observed = True)
# (Removed the stray no-op statement `obs.random` — it accessed a bound
# method without calling it and had no effect.)

@pm.potential
def censorfactor(obs=obs):
    # Heavily penalise states implying observations beyond the censoring
    # horizon of 23 (the sentinel used for non-converters).
    if np.any(obs > 23):
        return -100000
    else:
        return 0

# Sample the joint posterior; no burn-in or thinning yet (see question 4).
mcmc = pm.MCMC([alpha, beta, obs, censorfactor])
mcmc.sample(5000, burn = 0, thin = 1)


 [-----------------100%-----------------] 5000 of 5000 complete in 4.0 sec

In [137]:
# Standard PyMC2 diagnostics: trace, autocorrelation and histogram per variable.
pm.Matplot.plot(mcmc)


Plotting beta
Plotting alpha

Problems:

1 - Work out the mean observed time to convert
2 - Try to fit your data from section 1 
3 - Use the results to plot the distribution of the median
4 - Try adjusting the number of samples, the burn parameter and the amount of thinning to get good answers
5 - Try adjusting the prior and see how it affects the estimate
6 - Try to fit a different distribution to the data
7 - Compare answers
Bonus - test the hypothesis that the true median is greater than a certain amount

For question 2, note that the median of a Weibull is:

$$β(log 2)^{1/α}$$

In [141]:
#Solution to questions 2 and 3:
def weibull_median(alpha, beta):
    """Closed-form median of a Weibull(alpha, beta): beta * ln(2)**(1/alpha)."""
    exponent = 1 / alpha
    return beta * log(2) ** exponent
# Posterior distribution of the Weibull median — one value per retained MCMC draw.
plt.hist([weibull_median(x[0], x[1]) for x in zip(mcmc.trace("alpha"), mcmc.trace("beta"))])


Out[141]:
(array([   7.,   18.,   87.,  167.,  252.,  236.,  151.,   59.,   20.,    3.]),
 array([ 7.2924607 ,  7.34613157,  7.39980245,  7.45347332,  7.5071442 ,
         7.56081507,  7.61448595,  7.66815682,  7.7218277 ,  7.77549857,
         7.82916945]),
 <a list of 10 Patch objects>)

In [139]:
#Solution to question 4:
### Increasing the burn parameter allows us to discard results before convergence
### Thinning the results removes autocorrelation

# Re-run the sampler defined above: 50,000 iterations, discard the first
# 30,000 (burn-in), keep every 20th draw -> (50000 - 30000) / 20 = 1000 samples.
mcmc = pm.MCMC([alpha, beta, obs, censorfactor ] )
mcmc.sample(50000, burn = 30000, thin = 20)

pm.Matplot.plot(mcmc)


 [-----------------100%-----------------] 50000 of 50000 complete in 48.3 secPlotting beta
Plotting alpha

In [126]:
#Solution to Q5
## Adjusting the priors impacts the overall result
## If we give a looser, less informative prior then we end up with a broader, shorter distribution
## If we give much more informative priors, then we get a tighter, taller distribution


## Note the narrowing of the prior
#alpha = pm.Uniform("alpha", 2.5, 4.5) 
#beta = pm.Uniform("beta", 14, 15) 

#### The looser priors are the ones active below; swap with the commented
#### block above to compare the narrowed-prior result.
alpha = pm.Uniform("alpha", 0, 30) 
beta = pm.Uniform("beta", 0, 30) 

# Rebuild the likelihood so it references the new alpha/beta priors.
obs = pm.Weibull( 'obs', alpha, beta, value = df["time_to_convert"], observed = True )

# NOTE(review): censorfactor was defined with the *previous* Weibull obs bound
# as its default argument — confirm it still constrains this model as intended.
mcmc = pm.MCMC([alpha, beta, obs, censorfactor ] )
mcmc.sample(10000, burn = 100, thin = 20)
pm.Matplot.plot(mcmc)
#plt.hist([weibull_median(x[0], x[1]) for x in zip(mcmc.trace("alpha"), mcmc.trace("beta"))])


 [-----------------100%-----------------] 10000 of 10000 complete in 3.8 secPlotting beta
Plotting alpha

In [127]:
#Solution to Q6
## To fit a new distribution to the data, we need to change the obs variable to a new distribution.
## This requires creating new hyper parameters.
## PyMC2's Normal takes a mean and tau, where tau is the *precision*
## (1 / variance) — not 1 / stdev.

# NOTE(review): both imports below appear unused in this cell — pm and np are
# already imported at the top of the notebook.
import pymc as mc
import numpy.ma as ma

#this begins the model 
alpha = pm.Uniform("mean", 0,15)  # NOTE(review): named `alpha` but it is the Normal *mean*
tau = pm.Uniform("tau", 0, 3)  # prior on the precision
obs = pm.Normal( 'obs', alpha, tau, value = df["time_to_convert"], observed = True )

# NOTE(review): censorfactor still has the earlier Weibull `obs` bound as its
# default argument — confirm it applies to this Normal model as intended.
mcmc = pm.MCMC([alpha, tau, obs, censorfactor ] )
mcmc.sample(50000, burn = 30000, thin = 20)
#pm.Matplot.plot(mcmc)
plt.hist(mcmc.trace("mean")[:])


 [-----------------100%-----------------] 50000 of 50000 complete in 7.2 sec
Out[127]:
(array([   2.,   33.,   81.,  198.,  253.,  252.,  126.,   44.,    9.,    2.]),
 array([  9.91766868,   9.99304324,  10.06841781,  10.14379237,
         10.21916693,  10.2945415 ,  10.36991606,  10.44529063,
         10.52066519,  10.59603975,  10.67141432]),
 <a list of 10 Patch objects>)

In [128]:
## Solution to bonus
## Super easy to do in the Bayesian framework, all we need to do is look at what % of samples
## meet our criteria

testing_value = 10
number_of_greater_samples = sum([x >= testing_value for x in mcmc.trace("mean")[:]])
100 * (number_of_greater_samples / len(mcmc.trace("mean")[:]))


Out[128]:
99.700000000000003

In [ ]:
#Cox model

If we want to look at covariates, we need a new approach. We'll use Cox proportional hazards. More information here.


In [149]:
#Fitting solution
# Fit a Cox proportional-hazards model: duration column 'time_to_convert',
# censoring indicator 'event'; all remaining columns are covariates.
cf = lifelines.CoxPHFitter()
cf.fit(df, 'time_to_convert', event_col = 'event')
# Coefficient table: coef, hazard ratio exp(coef), standard errors, p-values.
cf.summary


C:\Users\j.coltman\AppData\Local\Continuum\Anaconda3\lib\site-packages\lifelines\fitters\coxph_fitter.py:285: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  df.sort(duration_col, inplace=True)
Out[149]:
coef exp(coef) se(coef) z p lower 0.95 upper 0.95
age 0.033443 1.034009 0.023445 1.426470 0.153733 -0.012518 0.079405
male 2.177676 8.825771 0.051060 42.649557 0.000000 2.077579 2.277773
search -2.051362 0.128560 0.047960 -42.771943 0.000000 -2.145383 -1.957341
brand 0.012488 1.012567 0.023274 0.536586 0.591554 -0.033138 0.058115

Once we've fit the data, we need to do something useful with it. Try to do the following things:

1 - Plot the baseline survival function

2 - Predict the functions for a particular set of regressors

3 - Plot the survival function for two different set of regressors

4 - For your results in part 3 calculate how much more likely a death event is for one than the other for a given period of time

In [150]:
#Solution to 1
fig, axis = plt.subplots(nrows=1, ncols=1, sharex=True, sharey = True)
# Survival curve for a subject with all covariates at their baseline values.
cf.baseline_survival_.plot(ax = axis, title = "Baseline Survival")


Out[150]:
<matplotlib.axes._subplots.AxesSubplot at 0xbe72850198>

In [151]:
# Solution to prediction
# Covariate order must match the fitted frame: [age, male, search, brand].
regressors = np.array([[45,0,0,0]])
survival = cf.predict_survival_function(regressors)
survival


Out[151]:
0
event_at
0.000000 1.000000
1.003506 0.999975
1.003550 0.999951
1.006204 0.999926
1.006625 0.999902
1.009505 0.999877
1.010444 0.999852
1.012825 0.999827
1.012875 0.999802
1.013797 0.999777
1.014682 0.999753
1.015168 0.999728
1.015346 0.999702
1.017288 0.999677
1.018251 0.999652
1.023922 0.999627
1.023941 0.999602
1.026372 0.999576
1.027967 0.999551
1.030997 0.999526
1.031851 0.999500
1.033067 0.999475
1.035437 0.999449
1.037895 0.999423
1.046156 0.999398
1.046392 0.999372
1.049333 0.999346
1.050559 0.999320
1.051622 0.999294
1.055779 0.999268
... ...
21.503143 0.029903
21.507114 0.029371
21.511207 0.028849
21.522724 0.028336
21.528409 0.027822
21.539483 0.027318
21.566481 0.026823
21.583204 0.026336
21.586815 0.025859
21.592842 0.025389
21.608974 0.024929
21.619958 0.024476
21.625831 0.024032
21.633895 0.023595
21.682726 0.023167
21.741672 0.022746
21.741723 0.022332
21.749161 0.021926
21.781097 0.021527
21.847043 0.021136
21.850629 0.020751
21.852741 0.020373
21.918205 0.020002
21.998203 0.019638
22.056664 0.019280
22.056797 0.018929
22.335316 0.018584
22.478930 0.018245
22.704566 0.017913
23.000000 0.017913

1910 rows × 1 columns


In [156]:
#Solution to plotting multiple regressors
# Covariate order matches the fitted frame: [age, male, search, brand].
fig, axis = plt.subplots(nrows=1, ncols=1, sharex=True)
regressor1 = np.array([[18,0,0,1]])
regressor2 = np.array([[56,0,0,1]])
survival_1 = cf.predict_survival_function(regressor1)
survival_2 = cf.predict_survival_function(regressor2)
# Labels fixed to describe the regressors actually used — the originals
# ("32 year old male", "46 year old female") matched neither array.
plt.plot(survival_1, label = "age 18, male=0, search=0, brand=1")
plt.plot(survival_2, label = "age 56, male=0, search=0, brand=1")
plt.legend(loc = "lower left")


Out[156]:
<matplotlib.legend.Legend at 0xbe740149e8>

In [158]:
#Difference in survival 
# NOTE(review): this is the ratio of survival probabilities S2(t)/S1(t)
# (relative survival), not odds in the strict sense — consider renaming.
odds = survival_2 / survival_1
plt.plot(odds, c = "red")


Out[158]:
[<matplotlib.lines.Line2D at 0xbe73c4bd30>]

Model selection

Difficult to do with classic tools (here)

Problem:

1 - Calculate the BMA coefficient values

2 - Compare these results to the lifelines results from the previous section

3 - Try running with different priors

In [ ]:
##Solution to 1
# NOTE(review): the priors list has 7 entries but df has only 4 covariates
# (age, male, search, brand) — confirm the length pyBMA expects here.
bmaCox = CoxPHFitter()
bmaCox.fit(df, "time_to_convert", event_col= "event", priors= [0.5]*7)

In [ ]:
# NOTE(review): `coefficient` is not defined anywhere above — presumably it
# should be captured from bmaCox.fit(...); confirm against the pyBMA API.
print(coefficient)

In [ ]:
# Compare the BMA coefficients against the maximum-likelihood lifelines fit.
# NOTE(review): depends on `coefficient` from the previous cell, which is
# never assigned — confirm how pyBMA exposes the fitted coefficients.
difference = coefficient - cf.summary["coef"]
print(difference)
print("Most very close, some significantly out (because no leaps and jumps)")